#delimit;
** Session setup for the hourly-to-weekly traffic aggregation: plain-text logging, no paging, and the replication root folder;
set more off;
set logtype text;
** Close a log of this name if an earlier run left it open. capture suppresses the error raised when no such log exists;
capture log close hourly_to_weekly_traffic;
pause on;

clear all;

** REPLACE FILE PATH WITH PATH TO RELEVANT REPLICATION FILES;
local fileloc = "~/KMS_REPLICATION";

** The path is wrapped in double quotes so that a replication folder whose name contains spaces still resolves;
log using "`fileloc'/log_files/hourly_to_weekly_traffic.txt", replace name(hourly_to_weekly_traffic);

**XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX;
**XXXXXXXXXXXX AGGREGATE REGION DATA INTO ONE  WEEKLY DATA SET XXXXXXXXXXX;
**XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX; 

** One scratch file per region. Each holds the weekly panel of that region until the regions are stacked after the loop;
tempfile temp03 temp04 temp07 temp08 temp11 temp12;

** Included regions (see text);
foreach region in 03 04 07 08 11 12 {;

	clear;

	** Stack every monthly file of this region for 2002 through 2007. Paths are quoted so folders with spaces work;
	forvalues year = 2002/2007 {;
		foreach month in 01 02 03 04 05 06 07 08 09 10 11 12 {;
			append using "`fileloc'/data/traffic_data/dta_files/d`region'_`year'_`month'.dta";
		};
	};

	** There is a problem with this sensor. It never has speed information. As such, we drop it here;
	drop if id == 718079;

	** Some sensors have only missing data - 1202555 and 1211922;
	drop if id == 1202555 | id == 1211922;

	** Drop all with missing values (very few);
	drop if tot_flow == .;
	** Drop what PeMS calls likely errors - occupancy greater than .9;
	** NOTE(review): Stata treats missing as larger than any number, so this also removes rows where avg_occ is missing - confirm that is intended;
	drop if avg_occ > .9;
	** Drop those without length data;
	drop if length == .;

	gen week = wofd(dofc(date));

	** Hold the length of each sensor fixed at the value in its first record. Length can change over time as more sensors are added. As we use a balanced panel of sensors, we also maintain constant length. See text;
	** Records are ordered by date within id, with a stable sort, so that the first record is the chronologically earliest one on every run. Sorting on id and week alone leaves records of the same week in random order;
	sort id date, stable;
	by id: gen newlength = length[1];
	replace length = newlength;

	** Flow weighted by the fixed segment length, summed to the week below;
	gen flow_by_length = tot_flow * length;

	** Weekly totals of flow and length-weighted flow, weekly means of the remaining measures, one row per sensor and week;
	collapse (sum) tot_flow flow_by_length
	(mean) avg_occ avg_spd lanes length, by(id week) fast;

	** Generate balanced panel of weeks: keep sensors first seen no later than 2002w1 and last seen no earlier than 2007w52;
	format week %tw;
	egen minweek = min(week), by(id);
	egen maxweek = max(week), by(id);
	keep if minweek <= tw(2002w1) & maxweek >= tw(2007w52);
	drop minweek maxweek;

	** Tempfile names are quoted because the temporary directory may contain spaces;
	save "`temp`region''";

};

** Stack the six regional weekly panels into one data set;
use "`temp03'", clear;
foreach region in 04 07 08 11 12 {;
	append using "`temp`region''";
};

sum id;

** Sometimes entire weeks have exact same values, likely due to sensor breakdown or imputation techniques. Currently dropped;
** A sensor is removed entirely if any two of its weeks share the same tot_flow and avg_spd. The sum before and after records the effect in the log;

sum;
duplicates tag id tot_flow avg_spd, g(duplication);
egen maxdupes = max(duplication), by(id);
drop if maxdupes ~= 0;
drop maxdupes;
sum;

keep id week tot_flow flow_by_length avg_occ avg_spd length;

order id week tot_flow flow_by_length avg_occ avg_spd length;

recast float tot_flow;
format tot_flow %12.0g;

** Keep only sensors seen for entire time frame;
** The required number of weekly rows is derived from the sample bounds 2002w1 to 2007w52, which gives 312;
local n_weeks = tw(2007w52) - tw(2002w1) + 1;
egen total_obs = count(tot_flow), by(id);
tab total_obs;

keep if total_obs == `n_weeks';
drop total_obs;

** Sort on week as well as id so the row order of the saved file is the same on every run. The data remain sorted by id;
sort id week;

save "`fileloc'/data/traffic_data/dta_files/weekly_traffic.dta", replace;

log close hourly_to_weekly_traffic;
